IMPORTING DATASETS
setwd("C:/Users/shubh/Desktop/Data Science Using R/Class 7 Data Science Using R")
customer_data <- read.csv("R Credit Card Case Study/Customer Acqusition.csv")
customer_data$No <- NULL
spend_data <- read.csv("R Credit Card Case Study/spend.csv")
spend_data$Sl.No. <- NULL
repay_data <- read.csv("R Credit Card Case Study/Repayment.csv")
repay_data$SL.No. <- NULL
IMPORTING PACKAGES
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(lubridate)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
require(ggplot2)
## Loading required package: ggplot2
require(plotly)
## Loading required package: plotly
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
RENAMING COLUMNS FOR BETTER UNDERSTANDING
repay_data <- dplyr::rename(repay_data, "Amt_Repay" = "Amount")
spend_data <- dplyr::rename(spend_data, "Amt_Spend" = "Amount")
Q1(a). REPLACE AGE WITH MEAN WHERE AGE IS LESS THAN 18
customer_data[customer_data$Age < 18, "Age"]
## [1] 16 13 15 12 16 16
customer_data[customer_data$Age < 18, "Age"] <- mean(customer_data$Age, na.rm = TRUE)
customer_data[customer_data$Age < 18, "Age"]
## numeric(0)
Q1(b). IF AMT_SPEND > LIMIT REPLACE WITH 50% OF LIMIT
customer_spend <- dplyr::left_join(x = customer_data, y = spend_data, by = c("Customer" = "Customer") )
head(customer_spend, 5)
## Customer Age City Product Limit Company Segment Month
## 1 A1 76 BANGALORE Gold 500000 C1 Self Employed 2004-01-12
## 2 A1 76 BANGALORE Gold 500000 C1 Self Employed 2004-01-03
## 3 A1 76 BANGALORE Gold 500000 C1 Self Employed 2004-01-15
## 4 A1 76 BANGALORE Gold 500000 C1 Self Employed 2004-01-25
## 5 A1 76 BANGALORE Gold 500000 C1 Self Employed 2005-01-17
## Type Amt_Spend
## 1 JEWELLERY 485470.80
## 2 PETRO 410556.13
## 3 CLOTHES 23740.46
## 4 FOOD 484342.47
## 5 CAMERA 369694.07
head(customer_spend[customer_spend$Amt_Spend > customer_spend$Limit, "Amt_Spend"], 5)
## [1] 199554.7 376887.5 439648.5 384078.7 117089.2
customer_spend[customer_spend$Amt_Spend > customer_spend$Limit, "Amt_Spend"] <-
0.5*customer_spend[customer_spend$Amt_Spend > customer_spend$Limit ,"Limit"]
head(customer_spend[customer_spend$Amt_Spend > customer_spend$Limit, "Amt_Spend"], 5)
## numeric(0)
Q1(c). IF AMT_REPAY > LIMIT REPLACE WITH LIMIT.
customer_repay <- dplyr::left_join(x = customer_data, y = repay_data, by = c("Customer" = "Customer"))
head(customer_repay, 5)
## Customer Age City Product Limit Company Segment Month
## 1 A1 76 BANGALORE Gold 500000 C1 Self Employed 2004-01-12
## 2 A1 76 BANGALORE Gold 500000 C1 Self Employed 2004-01-03
## 3 A1 76 BANGALORE Gold 500000 C1 Self Employed 2004-01-15
## 4 A1 76 BANGALORE Gold 500000 C1 Self Employed 2004-01-25
## 5 A1 76 BANGALORE Gold 500000 C1 Self Employed 2005-01-17
## Amt_Repay
## 1 495414.8
## 2 245899.0
## 3 259490.1
## 4 437555.1
## 5 165972.9
head(customer_repay[customer_repay$Amt_Repay > customer_repay$Limit, "Amt_Repay"], 5)
## [1] 185955.1 412783.3 148038.7 179144.7 381635.7
customer_repay[customer_repay$Amt_Repay > customer_repay$Limit, "Amt_Repay"] <-
customer_repay[customer_repay$Amt_Repay > customer_repay$Limit ,"Limit"]
head(customer_repay[customer_repay$Amt_Repay > customer_repay$Limit, "Amt_Repay"], 5)
## numeric(0)
Q2(a). NUMBER OF DISTINCT CUSTOMERS.
distinct_customers <- count(dplyr::distinct(customer_data))
print(paste("No. of distinct customers is", distinct_customers ))
## [1] "No. of distinct customers is 100"
Q2(b). NUMBER OF DISTINCT CATEGORIES(SEGMENTS).
count_categories <- count(dplyr::distinct(customer_data, Segment))
print(paste("No. of distinct categories is", count_categories ))
## [1] "No. of distinct categories is 5"
Q2(c). AVERAGE MONTHLY SPEND BY CUSTOMERS.
avg_spend_mth <- customer_spend%>% dplyr::group_by(lubridate::year(Month))%>%dplyr::summarise(AVG_SPEND = sum(Amt_Spend)/12)
## `summarise()` ungrouping output (override with `.groups` argument)
avg_spend_mth <- sum(avg_spend_mth$AVG_SPEND)/3
print(paste("Average Monthly Spend By All customers in a month is ", avg_spend_mth))
## [1] "Average Monthly Spend By All customers in a month is 6105030.14527778"
Q2(d). AVERAGE MONTHLY REPAYMENT BY CUSTOMERS.
avg_repay_mth <- customer_repay%>%dplyr::group_by(lubridate::year(Month))%>%dplyr::summarise(AVG_REPAY = sum(Amt_Repay)/12)
## `summarise()` ungrouping output (override with `.groups` argument)
avg_repay_mth <- sum(avg_repay_mth)/3
print(paste("Average Monthly Repayment By All Customers in a month is ", avg_repay_mth))
## [1] "Average Monthly Repayment By All Customers in a month is 6807070.55805555"
Q2(e). PROFIT FOR BANK FOR EACH MONTH.
monthly_spend <- customer_spend %>%
dplyr::group_by(lubridate::year(Month), lubridate::month(Month)) %>%
summarise(Amt_Spend = sum(Amt_Spend))
## `summarise()` regrouping output by 'lubridate::year(Month)' (override with `.groups` argument)
monthly_spend <- dplyr::rename(monthly_spend, "Year" = "lubridate::year(Month)",
"Month" = "lubridate::month(Month)")
monthly_repay <- customer_repay %>%
dplyr::group_by(lubridate::year(Month), lubridate::month(Month)) %>%
summarise(Amt_Repay = sum(Amt_Repay))
## `summarise()` regrouping output by 'lubridate::year(Month)' (override with `.groups` argument)
monthly_repay <- dplyr::rename(monthly_repay, "Year" = "lubridate::year(Month)",
"Month" = "lubridate::month(Month)")
monthly_profit <- left_join(monthly_repay, monthly_spend, by = c("Year" = "Year", "Month" = "Month"))
monthly_profit$Profit <- monthly_profit$Amt_Repay - monthly_profit$Amt_Spend
monthly_profit$Profit <- ifelse(monthly_profit$Profit > 0, monthly_profit$Profit * 0.29, 0)
monthly_profit
## # A tibble: 30 x 5
## # Groups: Year [3]
## Year Month Amt_Repay Amt_Spend Profit
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2004 1 44792021. 44279812. 148541.
## 2 2004 2 3391538. 3835952. 0
## 3 2004 3 2666519. 1871913. 230436.
## 4 2004 4 5268980. 3489415. 516074.
## 5 2004 5 3782760. 2146577. 474493.
## 6 2004 9 1902816. 2402110. 0
## 7 2004 11 1788399. 1114989. 195289.
## 8 2005 1 4894040. 5383607. 0
## 9 2005 2 25491800. 25928183. 0
## 10 2005 4 3634904. 4100066. 0
## # ... with 20 more rows
Q2(f). TOP 5 PRODUCT TYPES
top_5_products <- customer_spend %>% dplyr::group_by(customer_spend$Type) %>%
summarise(Frequency = n()) %>% arrange(desc(Frequency)) %>% head(5)
## `summarise()` ungrouping output (override with `.groups` argument)
print(top_5_products)
## # A tibble: 5 x 2
## `customer_spend$Type` Frequency
## <chr> <int>
## 1 PETRO 200
## 2 CAMERA 160
## 3 FOOD 160
## 4 AIR TICKET 147
## 5 TRAIN TICKET 132
Q2(g). CITY HAVING MAXIMUM SPEND
city_max_spend <- customer_spend %>% dplyr::group_by(customer_spend$City) %>%
summarise(Total_Spend = sum(Amt_Spend)) %>% arrange(desc(Total_Spend)) %>%
head(1)
## `summarise()` ungrouping output (override with `.groups` argument)
print(city_max_spend)
## # A tibble: 1 x 2
## `customer_spend$City` Total_Spend
## <chr> <dbl>
## 1 COCHIN 45963514.
Q2(h). EXPENDITURE ON BASIS OF AGE GROUP
customer_spend$Age_Group <- ifelse(customer_data$Age > 65, "Old(>65)",
ifelse(customer_data$Age > 50, "Senior(51-65)",
ifelse(customer_data$Age > 30, "Middle Age(31-50)",
ifelse(customer_data$Age >= 18, "Young(18-30)"))))
age_group_spend <- customer_spend %>% dplyr::group_by(Age_Group) %>%
summarise(Total_Spend = sum(Amt_Spend)) %>% arrange(desc(Total_Spend))
## `summarise()` ungrouping output (override with `.groups` argument)
print(age_group_spend)
## # A tibble: 4 x 2
## Age_Group Total_Spend
## <chr> <dbl>
## 1 Middle Age(31-50) 85833092.
## 2 Senior(51-65) 56882251.
## 3 Old(>65) 43999713.
## 4 Young(18-30) 33066029.
Q2.(i) TOP 10 CUSTOMERS IN TERMS OF REPAYMENT.
top_10_repayments <- customer_repay %>% dplyr::group_by(Customer) %>%
summarise(Total_Repayment = sum(Amt_Repay)) %>%
arrange(desc(Total_Repayment)) %>% head(10)
## `summarise()` ungrouping output (override with `.groups` argument)
print(top_10_repayments)
## # A tibble: 10 x 2
## Customer Total_Repayment
## <chr> <dbl>
## 1 A61 10539143.
## 2 A60 9876291.
## 3 A13 9572001.
## 4 A43 8489871.
## 5 A45 8448335.
## 6 A12 8334760.
## 7 A14 7943269.
## 8 A44 7744730.
## 9 A39 7622483.
## 10 A42 7615461.
Q3. CITYWISE SPEND ON EACH PRODUCT ON YEARLY BASIS.
city_analysis <-customer_spend %>% dplyr::group_by(City, Type, lubridate::year(Month)) %>%
summarise(Spend = sum(Amt_Spend))
## `summarise()` regrouping output by 'City', 'Type' (override with `.groups` argument)
city_analysis <- dplyr::rename(city_analysis, "Year" = "lubridate::year(Month)")
city_analysis <- data.frame(city_analysis)
print(head(data.frame(city_analysis), 5))
## City Type Year Spend
## 1 BANGALORE AIR TICKET 2004 749981.0
## 2 BANGALORE AIR TICKET 2005 1972296.9
## 3 BANGALORE AIR TICKET 2006 1311522.5
## 4 BANGALORE AUTO 2005 141539.8
## 5 BANGALORE AUTO 2006 822785.3
city_analysis_plot <- ggplot2::ggplot(data = city_analysis) +
aes(x = City, y = Spend, fill = Type) +
geom_bar(stat = "identity") + facet_grid(Year~.)
plot(city_analysis_plot)

city_analysis_plot <- plotly::ggplotly(city_analysis_plot)
city_analysis_plot
Q4(a). MONTHLY COMPARISON OF TOTAL SPENDS CITYWISE.
spend_citywise <- customer_spend %>%
dplyr::group_by(lubridate::year(Month),lubridate::month(Month), City) %>%
summarise(Spend = sum(Amt_Spend))
## `summarise()` regrouping output by 'lubridate::year(Month)', 'lubridate::month(Month)' (override with `.groups` argument)
spend_citywise <- dplyr::rename(spend_citywise, "Year" = "lubridate::year(Month)",
"Month" = "lubridate::month(Month)")
spend_citywise$Month <- lubridate::month(spend_citywise$Month, label = TRUE)
spend_citywise_plot <- ggplot2::ggplot(data = spend_citywise) +
aes(x = City, y = Spend, fill = Month) +
geom_bar(stat = "identity") + facet_grid(Year~.)
spend_citywise_plot <- plotly::ggplotly(spend_citywise_plot)
spend_citywise_plot
Q4(b). COMPARISON OF YEARLY SPEND ON AIR TICKET.
spend_airticket <- customer_spend[customer_spend$Type == "AIR TICKET", ] %>%
dplyr::group_by(lubridate::year(Month)) %>%
dplyr::summarise(Yearly_Spend = sum(Amt_Spend))
## `summarise()` ungrouping output (override with `.groups` argument)
spend_airticket <- dplyr::rename(spend_airticket, "Year" = "lubridate::year(Month)")
spend_airticket_plot <- ggplot2::ggplot(data = spend_airticket) +
aes(x = Year, y = Yearly_Spend) +
geom_bar(stat = "identity")
spend_airticket_plot <- plotly::ggplotly(spend_airticket_plot)
spend_airticket_plot
Q4(c). COMPARISON OF MONTHLY SPEND OF FOR EACH PRODUCT.
product_spend <- customer_spend %>%
dplyr::group_by(lubridate::year(Month), lubridate::month(Month), Type ) %>%
summarise(Spend = sum(Amt_Spend))
## `summarise()` regrouping output by 'lubridate::year(Month)', 'lubridate::month(Month)' (override with `.groups` argument)
product_spend <- dplyr::rename(product_spend, "Year" = "lubridate::year(Month)",
"Month" = "lubridate::month(Month)")
product_spend$Month <- lubridate::month(product_spend$Month, label = TRUE)
product_spend$Year <- as.factor(product_spend$Year)
product_spend_plot <- ggplot2::ggplot(data = product_spend) +
aes(x = Month, y = Spend, fill = Year) +
geom_bar(stat = "identity", position = "dodge") +
facet_grid(Type~.)
product_spend_plot <- plotly::ggplotly(product_spend_plot)
product_spend_plot
Q5. USER DEFINED FUNCTION FOR ANALYSIS.
# CORRECTING A MINOR MISTAKE
customer_repay[customer_repay$Product == "Platimum", "Product"] <- "Platinum"
top_10_cust_repay <- function(product, period){
data <- customer_repay[customer_repay$Product == product, ]
if(period == "year"|period == "Year"|period == "Yearly"|period == "yearly"|period == "YEAR"|period == "YEARLY")
{
data_summary <- data %>% dplyr::group_by(Customer, City, lubridate::year(Month)) %>%
summarise(Repayment = sum(Amt_Repay)) %>%
arrange(City, desc(Repayment))
data_summary <- dplyr::rename(data_summary, "Year" = "lubridate::year(Month)")
}
else if(period == "month"|period == "monthly"|period == "MONTH"|period == "MONTHLY"|period == "Month"|period == "Monthly")
{
data_summary <- data %>% dplyr::group_by(Customer, City, lubridate::month(Month)) %>%
summarise(Repayment = sum(Amt_Repay)) %>%
arrange(City, desc(Repayment))
data_summary <- dplyr::rename(data_summary, "Month" = "lubridate::month(Month)")
}
View(data_summary)
data_bangalore <- head(data_summary[data_summary$City == "BANGALORE", ], 10)
data_bombay <- head(data_summary[data_summary$City == "BOMBAY", ], 10)
data_calcutta <- head(data_summary[data_summary$City == "CALCUTTA", ], 10)
data_chennai <- head(data_summary[data_summary$City == "CHENNAI", ], 10)
data_cochin <- head(data_summary[data_summary$City == "COCHIN", ], 10)
data_delhi <- head(data_summary[data_summary$City == "DELHI", ], 10)
data_patna <- head(data_summary[data_summary$City == "PATNA", ], 10)
data_trivandrum <- head(data_summary[data_summary$City == "TRIVANDRUM", ], 10)
final_data <- rbind(data_bangalore, data_bombay, data_calcutta, data_chennai, data_cochin, data_delhi, data_patna, data_trivandrum)
if(period == "year")
{
final_data <- final_data %>% dplyr::group_by(Customer, City) %>% summarise %>% arrange(City)
}
else if(period == "month")
{
final_data <- final_data %>% dplyr::group_by(Customer, City) %>% summarise %>% arrange(City)
}
final_data
}
top_10_cust_repay("Gold", "month")
## `summarise()` regrouping output by 'Customer', 'City' (override with `.groups` argument)
## `summarise()` regrouping output by 'Customer' (override with `.groups` argument)
## # A tibble: 25 x 2
## # Groups: Customer [25]
## Customer City
## <chr> <chr>
## 1 A1 BANGALORE
## 2 A13 BANGALORE
## 3 A14 BANGALORE
## 4 A30 BANGALORE
## 5 A43 BANGALORE
## 6 A12 BOMBAY
## 7 A17 BOMBAY
## 8 A42 BOMBAY
## 9 A62 BOMBAY
## 10 A15 CALCUTTA
## # ... with 15 more rows
top_10_cust_repay("Platinum", "year")
## `summarise()` regrouping output by 'Customer', 'City' (override with `.groups` argument)
## `summarise()` regrouping output by 'Customer' (override with `.groups` argument)
## # A tibble: 24 x 2
## # Groups: Customer [24]
## Customer City
## <chr> <chr>
## 1 A19 BANGALORE
## 2 A37 BANGALORE
## 3 A5 BANGALORE
## 4 A52 BANGALORE
## 5 A33 BOMBAY
## 6 A36 BOMBAY
## 7 A4 BOMBAY
## 8 A51 BOMBAY
## 9 A20 CALCUTTA
## 10 A34 CALCUTTA
## # ... with 14 more rows